################################
#	John Henderson
#	Gerrymandering Incumbency 
#		(with Brian Hamel and Aaron Goldzimer)
#		January 1, 2018
#
################################
# analysisST :: function that produces distributions of moments for alternative plans by state and chamber

analysisST=function(state=state,chamber=chamber,flips=NULL,m0=1000,m1=1000000,path='~/Dropbox/StateRedistricting/replication/short/plans2010/'){
	set.seed(1005)                                                        	
	library(foreign) 
	library(mice)  
	library(matrixStats)

	if(is.null(flips)){
		flips=F
	}         

	fold_length=0
	imputeNAplans=function(dists,plans=NULL,impute=T){

		# imputing missing cells for partial plans :: synthetic alternative maps
		# can't just align districts by number, since these have geographical 
		# definition; and don't have geographical boundaries for non-districts
		# in shapefiles, etc. 

		# hard and fast rule: sort districts on the pr(Obama 08) for whole plans, 
		# then match existing districts in partial plans to those that reduce the k-plan
		# distances
		if(is.null(plans)){
			warning('No index for plans is included: care is required to map back onto the various alternatives.')
		}   
		# length of datas
		m=length(dists)
		n=nrow(dists[[1]])  
		# length of completes
		cm=0
		#mu=array(NA,m)  
		is.cm=array(FALSE,m)  
		ord.dists=list() 
		ord.mat=matrix(NA,n,length(dists))
		for(i in 1:m){
			if(all(complete.cases(dists[[i]]))){
				cm=cm+1
				is.cm[i]=T
				mu=dists[[i]][,1]/(dists[[i]][,1]+dists[[i]][,2])
				ord.dists[[i]]=dists[[i]][order(mu),]

				ord.mat[,i]=t(t(sort(mu)))
			}
		}   

		for(i in 1:m){ 

			if(is.cm[i]==F){
				mm=1:n
				mm=mm[order(dists[[i]][,1]/(dists[[i]][,1]+dists[[i]][,2]),na.last=T)]
				m0=sort(dists[[i]][,1]/(dists[[i]][,1]+dists[[i]][,2]),na.last=T)
				n0=as.numeric(names(m0)) 
				if(is.null(names(dists[[i]][,1]))|is.null(names(dists[[i]][,2]))){ 
					n0=as.numeric(mm)				
				}
				n1=array(NA,n)
				for(p in 1:length(m0)){ 
					if(is.na(m0[p])){
						break()
					}
					dd=sqrt(rowSums((ord.mat-m0[p])^2,na.rm=T)) 
					if(length(which(!is.na(n1)))>0){
						dd[1:max(which(!is.na(n1)))]=NA
					}
					n1[which(dd==min(dd,na.rm=T))]=n0[p]

					#if dd object is approaching length(m0) *too fast*
					if(p<length(m0[!is.na(m0)]) & max(which(is.na(n1)))<length(m0)){
						n1=c(n1[-c(max(which(is.na(n1))))],NA)
					} 			      			
				}
				n1[which(is.na(n1))]=sample(n0[which(is.na(m0))],replace=F)	
				ord.dists[[i]]=dists[[i]][n1,]		   		
			}
		}

		# quasi-randomization simulation : plans that have missing districts
		#  will have missings imputed for *all* the other plans that have those 
		#  districts : average district bias will be recorded ... 
		# complete plans are analyzed as is
		dist_insa=matrix(NA,n,1)
		dist_ins=matrix(NA,n,m) 
		colnames(dist_ins)=plans
		for(i in 1:m){  
			if(is.cm[i]==F){
				qn=ord.dists[[i]][,1]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])-ord.dists[[i]][,2]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])			
			  	mat_temp=matrix(qn,n,sum(is.cm))
				jj=0
				for(j in which(is.cm)){
					jj=jj+1
					qs=ord.dists[[j]][,1]/(ord.dists[[j]][,1]+ord.dists[[j]][,2])-ord.dists[[j]][,2]/(ord.dists[[j]][,1]+ord.dists[[j]][,2])							
					mat_temp[which(is.na(qn)),jj]=qs[which(is.na(qn))]				
				}                                                     
				colnames(mat_temp)=paste(plans[i],'sim',1:jj,sep='_')
				if(mean(complete.cases(dist_insa))==0){
					dist_insa=mat_temp
				} else {
					dist_insa=cbind(dist_insa,mat_temp)
				}
			} else {
				dist_ins[,i]=ord.dists[[i]][,1]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])-ord.dists[[i]][,2]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])			
			}
		} 
		if(mean(is.cm)<1){
			dist_ins=cbind(dist_ins[,which(is.cm==T)],dist_insa)
		}        

		return(dist_ins) 
	}


	# averages over simulations 
	avgSims=function(k_get,adopted=adopted,discard=F){
		library(stringr)
		nms=names(k_get)
		if(is.null(nms)){
			nms=colnames(k_get)
		} 
		if(is.null(nms)){
			warning('no names to indicate simulations')
			return(k_get)
		}
	    if(discard==T){
			return(k_get[-c(grep(nms,pattern='_sim_'))])
		} else if(discard==F){
			kk=k_get[-c(grep(nms,pattern='_sim_'))]
			gg=k_get[c(grep(nms,pattern='_sim_'))]
			nms_kk=nms[-c(grep(nms,pattern='_sim_'))]
			nms_gg=nms[c(grep(nms,pattern='_sim_'))]  

			idx=str_sub(nms_gg,1,str_locate(nms_gg,pattern='_sim')[,1]-1)		
			return(c(kk,tapply(gg,idx,mean,na.rm=T)))
		}
	}  
	
	# these states state senate maps are also assembly maps
	if((state=='AK' | state=='AZ' | state=='ID' | state=='SC' | state=='WA') & chamber=='SD'){
		chamber='AD'
	}  
	
	#files=paste(path,state,'/',chamber,'/',chamber,'_dists_list.Rdata',sep='')	
	files=paste(path,state,'_',chamber,'_dists_list.Rdata',sep='')	
	load(files) # dists + folders 
	
	if(state=='FL'){
		folders=gsub(folders,pattern='H000C9057_adopted',replace='H000C9057')
	}                                                             
	adopted=grep(folders,pattern='adopt')                                                                          
         
	m=nrow(dists[[adopted[1]]])

	k=length(dists)
	k_len=array(NA,k)
	for(i in 1:k){
		ip=c() 
		if(state=='CA'){
			# a couple districts w/ more than 1mil is given as remainder of the map and should be removed
			dists[[i]]=rbind(dists[[i]],0)
			ip=c()
			ip=c(ip,unique(c(which(dists[[i]][,1]<1000 | is.na(dists[[i]][,1])),which(dists[[i]][,2]<1000 | is.na(dists[[i]][,2])))))  
			ip=c(ip,which(rowSums(dists[[i]])>1000000)) 
		}
		
		if(length(ip)>0 & i>1){
			dists[[i]]=(dists[[i]][-c(ip),])
		}
		if(is.null(nrow(dists[[i]]))){
			dists[[i]]=t(dists[[i]])
		}
		k_len[i]=nrow(dists[[i]]) 							
	}
	ks=which(k_len<max(k_len))                     
      
    set.seed(1005)
	for(i in 1:length(dists)){
		if(nrow(dists[[i]])<m){
			nm=m-nrow(dists[[i]])
			temps=matrix(NA,nm,ncol(dists[[i]]))
			dists[[i]]=rbind(dists[[i]],temps)		
		} else if(nrow(dists[[i]])>m){
			dists[[i]]=dists[[i]][sample(1:nrow(dists[[i]]),size=m,replace=F),]
		} 
		dists[[i]]=as.data.frame(dists[[i]])     
		rownames(dists[[i]])[1:m]=1:m
	}
	
	folders=c(gsub(folders,pattern=paste(chamber,'_plans/',sep=''),replace=''))		
	f_adopted=adopted
		
	outs=imputeNAplans(dists,plans=folders) 
	            
	if(flips==F){ 
		k_get=colMeans(abs(outs))     	
		adopted=grep(names(k_get),pattern='adopt')			
	} else if(flips==T){
		k_get=t(-outs[order(rowMeans(-outs)),])
		adopted=grep(rownames(k_get),pattern='adopt')  			
	}   
	ap=outs[,c(agrep(colnames(outs),pattern='adopt'))]
	op=outs[,-c(grep(colnames(outs),pattern='adopt'))]                           

	return(list('ap'=ap,'op'=op,'adopted'=adopted,'k_get'=k_get,'outs'=outs,'folders'=folders,'f_adopted'=f_adopted))
}
#end